1 Setup

1.1 Packages

library(tidyverse)
library(cowplot)
library(GGally)
library(heatmaply)
library(sva)
library(limma)
library(broom)
library(ggridges)

1.2 Data

1.2.1 Metabolite Abundances

# Cells #
fa.cell.neg.raw <- read_csv("./data/abundances/fa_1and2_cells_target_negmode.csv")
Parsed with column specification:
cols(
  .default = col_double(),
  Samples = col_character(),
  Mode = col_character(),
  Type = col_character(),
  Group = col_character(),
  Treatment = col_character(),
  Plate = col_character(),
  Experiment = col_character()
)
See spec(...) for full column specifications.
fa.cell.pos.raw <- read_csv("./data/abundances/fa_1and2_cells_target_posmode.csv")
Parsed with column specification:
cols(
  .default = col_double(),
  Samples = col_character(),
  Mode = col_character(),
  Type = col_character(),
  Group = col_character(),
  Treatment = col_character(),
  Plate = col_character(),
  Experiment = col_character()
)
See spec(...) for full column specifications.
# Media #
fa.med.neg.raw <- read_csv("./data/abundances/fa_1and2_media_target_negmode.csv")
Parsed with column specification:
cols(
  .default = col_double(),
  Samples = col_character(),
  Mode = col_character(),
  Type = col_character(),
  Group = col_character(),
  Treatment = col_character(),
  Plate = col_character(),
  Experiment = col_character()
)
See spec(...) for full column specifications.
fa.med.pos.raw <- read_csv("./data/abundances/fa_1and2_media_target_posmode.csv")
Parsed with column specification:
cols(
  .default = col_double(),
  Samples = col_character(),
  Mode = col_character(),
  Type = col_character(),
  Group = col_character(),
  Treatment = col_character(),
  Plate = col_character(),
  Experiment = col_character()
)
See spec(...) for full column specifications.

1.2.2 Compound Information

# Cells #
fa.cell.neg.compound.info <- read_csv("./data/compound_info/fa_1and2_cells_target_negmode_cmpnd_info.csv")
Parsed with column specification:
cols(
  compound_short = col_character(),
  compound_full = col_character(),
  formula = col_character(),
  mass = col_double(),
  rt = col_double(),
  cas_id = col_character()
)
fa.cell.pos.compound.info <- read_csv("./data/compound_info/fa_1and2_cells_target_posmode_cmpnd_info.csv")
Parsed with column specification:
cols(
  compound_short = col_character(),
  compound_full = col_character(),
  formula = col_character(),
  mass = col_double(),
  rt = col_double(),
  cas_id = col_character()
)
# Media # 
fa.med.neg.compound.info <- read_csv("./data/compound_info/fa_1and2_media_target_negmode_cmpnd_info.csv")
Parsed with column specification:
cols(
  compound_short = col_character(),
  compound_full = col_character(),
  formula = col_character(),
  mass = col_double(),
  rt = col_double(),
  cas_id = col_character()
)
fa.med.pos.compound.info <- read_csv("./data/compound_info/fa_1and2_media_target_posmode_cmpnd_info.csv")
Parsed with column specification:
cols(
  compound_short = col_character(),
  compound_full = col_character(),
  formula = col_character(),
  mass = col_double(),
  rt = col_double(),
  cas_id = col_character()
)
# Kegg/other ID reference #
cmpnd.id.db <- read_csv("./data/other/anp_db_compound_kegg.csv")
Parsed with column specification:
cols(
  compound_full = col_character(),
  cas_id = col_character(),
  HMDB = col_character(),
  Metlin = col_character(),
  KEGG = col_character()
)
# Other sample info
fa.cell.other.info <- read_csv("./data/other/fa_exp1_other_info.csv")
Parsed with column specification:
cols(
  Samples = col_character(),
  prot_conc_ug_uL = col_double(),
  run_order = col_integer()
)

1.3 Functions

MissingPerSamplePlot <- function(raw.data, start.string) {
  # Counts the number of missing/NA values per sample and
  # percent compounds missing out of total number of compounds per sample
  # Then passes the result into a vertical bar plot, where each 
  # bar represents a single sample and the size of the bar 
  # is the % of compounds missing
  
  counted.na <- raw.data %>%
  select(starts_with(start.string)) %>% 
  mutate(
    count.na = apply(., 1, function(x) sum(is.na(x))),
    percent.na = (count.na / ncol(raw.data %>% select(starts_with(start.string)))) * 100
    ) %>%
  dplyr::select(count.na, percent.na) %>%
  bind_cols(
    raw.data %>% 
      select(Samples, Group)
      ) %>% 
  arrange(percent.na) %>% 
  mutate(f.order = factor(Samples, levels = Samples))
counted.na %>% 
  ggplot(aes(x = f.order, y = percent.na, fill = Group)) +
  geom_bar(stat = "identity") +
  geom_hline(yintercept = 20, color = "gray", size = 1, alpha = 0.8) +
  coord_flip()+
  xlab("Samples") +
  ylab("Percent missing values in sample") +
  theme(axis.text.y = element_text(size = 6)) 
}
MissingPerCompound <- function(raw.data, start.string){
  # Function to count in how many experimental samples each compound is missing
  # Also calculates the % missing:
  # (# NA per compound across all experimental samples) * 100 / (tot num of samples)
  
  raw.data %>% 
  filter(Group == "sample") %>% 
  select(Samples, starts_with(start.string)) %>% 
  gather(key = "Compound", value = "Abundance", -Samples) %>% 
  group_by(Compound) %>% 
  summarise(
    na_count = sum(is.na(Abundance)),
    n_samples = n(),
    percent_na = (na_count * 100 / n_samples)
    ) %>% 
  filter(na_count > 0) %>% 
  arrange(desc(percent_na))
}
ReplaceNAwMinLogTransformSingle <- function(raw.dataframe, start.prefix) {
  # Function to replace any NAs in each column with the minimum for that column, 
  # separately for each sample type.
  # NA in negative control samples are replaced by 2.
  # Then data is log2 transformed
  
  smpls <- raw.dataframe %>%
    filter(Group == "sample") %>% 
    dplyr::select(starts_with(start.prefix))
  smpls.min <- lapply(smpls, min, na.rm = TRUE)
  # replace the missing values in the real samples with the minimum of the samples
  # then take the log
  smpls.noNA <- raw.dataframe  %>%
    filter(Group == "sample") %>% 
    dplyr::select(Samples:Experiment) %>%
    bind_cols(
      smpls %>%
        replace_na(replace = smpls.min) %>% 
        log2()
      )
  # QC
  QC <- raw.dataframe %>%
    filter(Group == "QC") %>% 
    dplyr::select(starts_with(start.prefix))
  QC.min <- lapply(QC, min, na.rm = TRUE)
  # replace the missing values in the QC with the minimum of the QC
  # then take the log
  QC.noNA <- raw.dataframe  %>%
    filter(Group == "QC") %>% 
    dplyr::select(Samples:Experiment) %>%
    bind_cols(
      QC %>%
        replace_na(replace = QC.min) %>% 
        log2()
      )
  # replace the missing values in solv and empty samples with 2 - for PCA analysis
  # then take the log
  other.min <- setNames(
    as.list(
      rep(2, ncol(
        raw.dataframe %>% 
          dplyr::select(starts_with(start.prefix))))
      ),
    colnames(raw.dataframe %>% dplyr::select(starts_with(start.prefix)))
                )
  other.num.log <- raw.dataframe %>%
    filter(Group != "sample" & Group != "QC") %>% 
    dplyr::select(Samples:Experiment) %>%
    bind_cols(
      raw.dataframe %>% 
        filter(Group != "sample" & Group != "QC") %>% 
        dplyr::select(starts_with(start.prefix)) %>%
        replace_na(replace = other.min) %>% 
        log2()
      )
  # combine them together back into one data frame
  all.noNA <- smpls.noNA %>% 
    bind_rows(QC.noNA) %>% 
    bind_rows(other.num.log)
}
HeatmapPrepAlt <- function(raw.data, start.prefix){
  # function for preparing dara for heatmap viz
  x <- raw.data %>% 
    select(starts_with(start.prefix)) %>% 
    scale(center = TRUE, scale = TRUE) %>% 
    as.matrix() 
  row.names(x) <- raw.data$Samples
  return(x)
}

2 Data Exploration

2.1 Mass vs Retention Time

Q: What are the distributions of compound masses and retention times?

# bind all 4 compound info df into 1 
full.fa.cmpnd <- fa.cell.neg.compound.info %>% 
  mutate(Set = "Cells / Neg") %>% 
  bind_rows(fa.cell.pos.compound.info %>% mutate(Set = "Cells / Pos")) %>% 
  bind_rows(fa.med.neg.compound.info %>% mutate(Set = "Media / Neg")) %>%
  bind_rows(fa.med.pos.compound.info %>% mutate(Set = "Media / Pos"))
full.fa.cmpnd %>% 
  ggplot(aes(x = rt, y = mass)) +
  geom_point(size = 3, alpha = 0.3) +
  xlab("Retention Time (min)") +
  ylab("Mass (Da)") +
  ggtitle("Mass v RT\nFA Only Exp") +
  ylim(0, 1000)

full.fa.cmpnd %>% 
  ggplot(aes(x = rt, y = mass, color = Set)) +
  geom_point(size = 3, alpha = 0.5) +
  xlab("Retnetion Time (min)") +
  ylab("Mass (Da)") +
  ggtitle("Mass v RT\nFA Only Exp") +
  facet_wrap(~ Set) +
  ylim(0, 1000)

Q: Which compounds were found in one or more of the data types?

### FA cell join ###
fa.cell.cmpnd.join <- fa.cell.neg.compound.info %>% 
  inner_join(fa.cell.pos.compound.info, by = "cas_id", suffix = c(".c.n", ".c.p")) %>% 
  select(
    contains("cas_id"), contains("short"), 
    contains("full"), starts_with("formula"), 
    starts_with("mass"), starts_with("rt")
    )
# compound names - found in pos and neg mode / cells 
print(fa.cell.cmpnd.join$compound_full.c.n)
  [1] "Glycine"                                    
  [2] "Alanine"                                    
  [3] "Sarcosine"                                  
  [4] "2-Aminobutyric Acid"                        
  [5] "GABA"                                       
  [6] "BAIBA"                                      
  [7] "Serine"                                     
  [8] "Hypotaurine"                                
  [9] "Uracil"                                     
 [10] "Proline"                                    
 [11] "Valine"                                     
 [12] "Threonine"                                  
 [13] "Cysteine"                                   
 [14] "Taurine"                                    
 [15] "4-Oxoproline"                               
 [16] "trans-4-Hydroxyproline"                     
 [17] "Creatine"                                   
 [18] "Isoleucine"                                 
 [19] "Leucine"                                    
 [20] "Asparagine"                                 
 [21] "Aspartic Acid"                              
 [22] "Adenine"                                    
 [23] "Hypoxanthine"                               
 [24] "Urocanic Acid"                              
 [25] "O-Phosphoethanolamine"                      
 [26] "Glutamine"                                  
 [27] "Lysine"                                     
 [28] "Glutamic Acid"                              
 [29] "Methionine"                                 
 [30] "2-Phenylglycine"                            
 [31] "Xanthine"                                   
 [32] "Ribitol"                                    
 [33] "3-Sulfinoalanine"                           
 [34] "3-Hydroxyanthranilic Acid"                  
 [35] "Histidine"                                  
 [36] "Orotic Acid"                                
 [37] "Allantoin"                                  
 [38] "N-Methylglutamic Acid"                      
 [39] "Phenylalanine"                              
 [40] "Pyridoxine"                                 
 [41] "DHAP"                                       
 [42] "G3P"                                        
 [43] "Glycerol 1-Phosphate / Glycerol 3-Phosphate"
 [44] "Arginine"                                   
 [45] "N-Carbamoyl-L-aspartic Acid"                
 [46] "Tyrosine"                                   
 [47] "D-Galactitol"                               
 [48] "N-alpha-Acetylglutamine"                    
 [49] "Glucuronic Acid"                            
 [50] "Tryptophan"                                 
 [51] "Phosphocreatine"                            
 [52] "Glycerylphosphorylethanolamine"             
 [53] "O-Succinylhomoserine"                       
 [54] "Pantothenic Acid"                           
 [55] "GlcNAc"                                     
 [56] "Cystathionine"                              
 [57] "Deoxycytidine"                              
 [58] "Cytidine"                                   
 [59] "Uridine"                                    
 [60] "Palmitoleic Acid"                           
 [61] "Glycerol 1-phosphoserine"                   
 [62] "D-Glucose 6-phosphate"                      
 [63] "Thiamine"                                   
 [64] "Adenosine"                                  
 [65] "Inosine"                                    
 [66] "10Z-Heptadecenoic Acid"                     
 [67] "Myoinositol-methyl-phosphate"               
 [68] "Oleic Acid"                                 
 [69] "Guanosine"                                  
 [70] "Xanthosine"                                 
 [71] "1-Monomyristin"                             
 [72] "2,3-cyclic CMP"                             
 [73] "Glutathione"                                
 [74] "Neu5Ac"                                     
 [75] "CMP"                                        
 [76] "UMP"                                        
 [77] "3-Phosphoglyceroinositol"                   
 [78] "AMP"                                        
 [79] "GMP"                                        
 [80] "Succinoadenosine"                           
 [81] "SAH"                                        
 [82] "CDP"                                        
 [83] "ADP"                                        
 [84] "GDP"                                        
 [85] "LysoPE(18:0)"                               
 [86] "CTP"                                        
 [87] "UTP"                                        
 [88] "CDP-Choline"                                
 [89] "ATP"                                        
 [90] "GTP"                                        
 [91] "GDP-Glucose"                                
 [92] "UDP-GalNAc"                                 
 [93] "UDP-GlcNAc"                                 
 [94] "GSSG"                                       
 [95] "NAD"                                        
 [96] "NADH"                                       
 [97] "NADP"                                       
 [98] "CoA"                                        
 [99] "PC(36:4)"                                   
[100] "PC(36:4)"                                   
# percent of cell / neg compounds found in cell / pos 
round(nrow(fa.cell.cmpnd.join) * 100 / nrow(fa.cell.neg.compound.info), 1)
[1] 47.4
# percent of cell / neg compounds found in cell / pos 
round(nrow(fa.cell.cmpnd.join) * 100 / nrow(fa.cell.pos.compound.info), 1)
[1] 66.2
### FA media join ###
fa.med.cmpnd.join <- fa.med.neg.compound.info %>% 
  inner_join(fa.med.pos.compound.info, by = "cas_id", suffix = c(".m.n", ".m.p")) %>% 
  select(
    contains("cas_id"), contains("short"), 
    contains("full"), starts_with("formula"), 
    starts_with("mass"), starts_with("rt")
    )
# compound names - found in pos and neg mode / media
print(fa.med.cmpnd.join$compound_full.m.n)
 [1] "Glycine"                  "Alanine"                 
 [3] "Serine"                   "Uracil"                  
 [5] "Creatinine"               "Proline"                 
 [7] "Valine"                   "Threonine"               
 [9] "Taurine"                  "4-Oxoproline"            
[11] "Isoleucine"               "Leucine"                 
[13] "Asparagine"               "Hypoxanthine"            
[15] "Glutamine"                "Glutamic Acid"           
[17] "Methionine"               "Xanthine"                
[19] "Allantoin"                "Phenylalanine"           
[21] "Uric Acid"                "Pyridoxine"              
[23] "Tyrosine"                 "D-Galactitol"            
[25] "Phenylacetylglycine"      "Cysteine-S-sulfonic Acid"
[27] "Pantothenic Acid"         "Uridine"                 
[29] "Folic Acid"              
# percent of media / neg compounds found in media / pos
round(nrow(fa.med.cmpnd.join) * 100 / nrow(fa.med.neg.compound.info), 1)
[1] 46.8
# percent of media / pos compounds found in media / neg
round(nrow(fa.med.cmpnd.join) * 100 / nrow(fa.med.pos.compound.info), 1)
[1] 43.3
### FA all sets join ###
fa.all.cmpnd.join <- fa.cell.cmpnd.join %>% 
  inner_join(fa.med.cmpnd.join, by = "cas_id") %>% 
  select(
    contains("cas_id"), contains("short"), 
    contains("full"), starts_with("formula"), 
    starts_with("mass"), starts_with("rt")
    )
nrow(fa.all.cmpnd.join)
[1] 24
# check for any mass issues between all 4 modes 
fa.all.cmpnd.join %>% 
  select(contains("mass")) %>% 
  ggpairs()

# RT issues?
fa.all.cmpnd.join %>% 
  select(starts_with("rt")) %>% 
  ggpairs()

2.2 Missing Values

Q: Do any of the samples have greater than 20% missing (NA) compound abundances, out of all of the features in the dataset?

MissingPerSamplePlot(fa.cell.neg.raw, "FAnC") +
  ggtitle("Missing Per Sample\nFA-only / Cells / Neg Mode")

MissingPerSamplePlot(fa.cell.pos.raw, "FApC") +
  ggtitle("Missing Per Sample\nFA-only / Cells / Pos Mode")

MissingPerSamplePlot(fa.med.neg.raw, "FAnM") +
  ggtitle("Missing Per Sample\nFA-only / Media / Neg Mode")

MissingPerSamplePlot(fa.med.pos.raw, "FApM") +
  ggtitle("Missing Per Sample\nFA-only / Media / Pos Mode")

(fa.cell.neg.cmpnd.to.excl <- MissingPerCompound(fa.cell.neg.raw, "FAnC") %>% 
  filter(percent_na > 20))
# A tibble: 1 x 4
  Compound na_count n_samples percent_na
  <chr>       <int>     <int>      <dbl>
1 FAnC52          7        24       29.2
(fa.cell.pos.cmpnd.to.excl <- MissingPerCompound(fa.cell.pos.raw, "FApC") %>% 
  filter(percent_na > 20))
# A tibble: 1 x 4
  Compound na_count n_samples percent_na
  <chr>       <int>     <int>      <dbl>
1 FApC23          8        24       33.3
MissingPerCompound(fa.med.neg.raw, "FAnM") %>% 
  filter(percent_na > 20)
# A tibble: 0 x 4
# ... with 4 variables: Compound <chr>, na_count <int>, n_samples <int>,
#   percent_na <dbl>
MissingPerCompound(fa.med.pos.raw, "FApM") %>% 
  filter(percent_na > 20)
# A tibble: 0 x 4
# ... with 4 variables: Compound <chr>, na_count <int>, n_samples <int>,
#   percent_na <dbl>

2.3 Negative Controls and Compound Elimination

2.3.1 Cells / Negative Mode

fa.cell.neg.raw.grp.mean <- fa.cell.neg.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("FAnC")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
fa.cell.neg.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nFA-only / Cells / Negative Mode\nGrouped by sample type")

fa.cell.neg.raw.grp.mean.order <- fa.cell.neg.raw.grp.mean %>% 
  filter(Group == "sample") %>% 
  arrange(Grp_mean_abun)
fa.cell.neg.raw %>% 
  select(Samples, Group, starts_with("FAnC")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = fa.cell.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 1) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  geom_line(
    data = fa.cell.neg.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = fa.cell.neg.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nFA-only / Cells / Negative Mode")

fa.cell.neg.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = fa.cell.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)") +
  ggtitle("Profile Plot of compound means by sample type only\nFA-only / Cells / Negative Mode")

fa.cell.neg.raw.grp.diff <- fa.cell.neg.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(smpl_solv_diff = sample / solv)
fa.cell.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 50)

# include compounds with FC > 2.5 or FC is NA (indication of NA in solv)
fa.cell.neg.cmpnd.to.incl <- fa.cell.neg.raw.grp.diff %>% 
  filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff))  %>% 
  filter(!(Compound %in% fa.cell.neg.cmpnd.to.excl$Compound))
# how many compound were there in the raw dataset?
nrow(fa.cell.neg.raw.grp.diff)
[1] 211
# how many compounds are retained for further analysis?
nrow(fa.cell.neg.cmpnd.to.incl)
[1] 204

2.3.2 Cells / Positive Mode

fa.cell.pos.raw.grp.mean <- fa.cell.pos.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("FApC")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
fa.cell.pos.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nFA-only / Cells / Positive Mode\nGrouped by sample type")

fa.cell.pos.raw.grp.mean.order <- fa.cell.pos.raw.grp.mean %>% 
  filter(Group == "sample") %>% 
  arrange(Grp_mean_abun)
fa.cell.pos.raw %>% 
  select(Samples, Group, starts_with("FApC")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = fa.cell.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 1) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  geom_line(
    data = fa.cell.pos.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = fa.cell.pos.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nFA-only / Cells / Positive Mode")

fa.cell.pos.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = fa.cell.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)") +
  ggtitle("Profile Plot of compound means by sample type only\nFA-only / Cells / Positive Mode")

fa.cell.pos.raw.grp.diff <- fa.cell.pos.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(smpl_solv_diff = sample / solv)
fa.cell.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 50)

fa.cell.pos.cmpnd.to.incl <- fa.cell.pos.raw.grp.diff %>% 
  filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff)) %>% 
  filter(!(Compound %in% fa.cell.pos.cmpnd.to.excl$Compound))
# compounds in the original set
nrow(fa.cell.pos.raw.grp.diff)
[1] 151
# compounds left in analysis
nrow(fa.cell.pos.cmpnd.to.incl)
[1] 145

2.3.3 Media / Negative Mode

fa.med.neg.raw.grp.mean <- fa.med.neg.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("FAnM")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
fa.med.neg.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nFA-only / Media / Negative Mode\nGrouped by sample type")

fa.med.neg.raw.grp.mean.order <- fa.med.neg.raw.grp.mean %>% 
  filter(Group == "sample") %>% 
  arrange(Grp_mean_abun)
fa.med.neg.raw %>% 
  select(Samples, Group, starts_with("FAnM")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = fa.med.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 2) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  geom_line(
    data = fa.med.neg.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = fa.med.neg.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nFA-only / Media / Negative Mode")

fa.med.neg.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = fa.med.neg.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)") +
  ggtitle("Profile Plot of compound means by sample type only\nFA-only / Media / Negative Mode")

fa.med.neg.raw.grp.diff <- fa.med.neg.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(smpl_solv_diff = sample / solv)
fa.med.neg.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 25)

fa.med.neg.cmpnd.to.incl <- fa.med.neg.raw.grp.diff %>% 
  filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff))
# original number
nrow(fa.med.neg.raw.grp.diff)
[1] 62
# number of metabolites after filtering
nrow(fa.med.neg.cmpnd.to.incl)
[1] 48

2.3.4 Media / Positive Mode

fa.med.pos.raw.grp.mean <- fa.med.pos.raw %>% 
  group_by(Group) %>% 
  summarize_at(vars(matches("FApM")), mean, na.rm = TRUE) %>% 
  gather(key = "Compound", value = "Grp_mean_abun", -Group)
fa.med.pos.raw.grp.mean %>% 
  ggplot(aes(log2(Grp_mean_abun), color = Group)) +
  geom_density(size = 2, alpha = 0.8) +
  ggtitle("Distribution of compound means\nFA-only / Media / Positive Mode\nGrouped by sample type")

fa.med.pos.raw.grp.mean.order <- fa.med.pos.raw.grp.mean %>% 
  filter(Group == "sample") %>% 
  arrange(Grp_mean_abun)
fa.med.pos.raw %>% 
  select(Samples, Group, starts_with("FApM")) %>% 
  gather("Compound", value = "Abundance", -c(Samples, Group)) %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = fa.med.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) + 
  geom_line(alpha = 0.1, size = 2) + 
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  geom_line(
    data = fa.med.pos.raw.grp.mean %>% 
      mutate(Cmpnd_sort = factor(Compound, levels = fa.med.pos.raw.grp.mean.order$Compound)), 
    aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
    size = 0.5
    ) +
  ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nFA-only / Media / Positive Mode")

fa.med.pos.raw.grp.mean %>% 
  mutate(Cmpnd_sort = factor(Compound, levels = fa.med.pos.raw.grp.mean.order$Compound)) %>% 
  ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
  geom_point(size = 1, alpha = 0.8) +
  geom_line(alpha = 0.8) +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Compound") +
  ylab("log2(Sample Type Mean)") +
  ggtitle("Profile Plot of compound means by sample type only\nFA-only / Media / Positive Mode")

fa.med.pos.raw.grp.diff <- fa.med.pos.raw.grp.mean %>% 
  spread(Group, Grp_mean_abun) %>% 
  mutate(smpl_solv_diff = sample / solv)
fa.med.pos.raw.grp.diff %>% 
  ggplot(aes(log2(smpl_solv_diff))) +
  geom_histogram(bins = 25)

fa.med.pos.cmpnd.to.incl <- fa.med.pos.raw.grp.diff %>% 
  filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff))
# original number of metabolites
nrow(fa.med.pos.raw.grp.diff)
[1] 67
# number left after filtering
nrow(fa.med.pos.cmpnd.to.incl)
[1] 47

3 Data Prep and Preliminary Analysis

3.1 Cleanup

fa.cell.neg.noNA <- fa.cell.neg.raw %>% 
  select(Samples:Experiment, one_of(fa.cell.neg.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformSingle("FAnC")
fa.cell.pos.noNA <- fa.cell.pos.raw %>% 
  select(Samples:Experiment, one_of(fa.cell.pos.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformSingle("FApC")
fa.med.neg.noNA <- fa.med.neg.raw %>% 
  select(Samples:Experiment, one_of(fa.med.neg.cmpnd.to.incl$Compound)) %>% 
  ReplaceNAwMinLogTransformSingle("FAnM")
fa.med.pos.noNA <- fa.med.pos.raw %>% 
  select(Samples:Experiment, one_of(fa.med.pos.cmpnd.to.incl$Compound)) %>%  
  ReplaceNAwMinLogTransformSingle("FApM")

3.2 Distribution plots

3.2.1 Cells / Negative Mode

fa.cell.neg.noNA.gathered <- fa.cell.neg.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(fa.cell.neg.noNA) == "FAnC1"):ncol(fa.cell.neg.noNA)
    )
fa.cell.neg.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nFA-only / Cells / Negative Mode")

fa.cell.neg.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nFA-only / Cells / Negative Mode")
Picking joint bandwidth of 0.997

fa.cell.neg.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nFA-only / Cells / Negative Mode")
Picking joint bandwidth of 0.931

fa.cell.neg.noNA.gathered %>%
  filter(Group == "sample") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nFA-only / Cells / Negative Mode")

3.2.2 Cells / Positive Mode

fa.cell.pos.noNA.gathered <- fa.cell.pos.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(fa.cell.pos.noNA) == "FApC1"):ncol(fa.cell.pos.noNA)
    )
fa.cell.pos.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nFA-only / Cells / Positive Mode")

fa.cell.pos.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nFA-only / Cells / Positive Mode")
Picking joint bandwidth of 1.03

fa.cell.pos.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nFA-only / Cells / Positive Mode")
Picking joint bandwidth of 1.03

fa.cell.pos.noNA.gathered %>%
  filter(Group == "sample") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nFA-only / Cells / Positive Mode")

3.2.3 Media / Negative Mode

fa.med.neg.noNA.gathered <- fa.med.neg.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(fa.med.neg.noNA) == "FAnM1"):ncol(fa.med.neg.noNA)
    )
fa.med.neg.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nFA-only / Media / Negative Mode")

fa.med.neg.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nFA-only / Media / Negative Mode")
Picking joint bandwidth of 0.872

fa.med.neg.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nFA-only / Media / Negative Mode")
Picking joint bandwidth of 0.866

fa.med.neg.noNA.gathered %>%
  filter(Group == "sample") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nFA-only / Media / Negative Mode")

3.2.4 Media / Positive Mode

fa.med.pos.noNA.gathered <- fa.med.pos.noNA %>% 
  gather(
    key = "Metabolite", "Abundance", 
    which(colnames(fa.med.pos.noNA) == "FApM1"):ncol(fa.med.pos.noNA)
    )
fa.med.pos.noNA.gathered %>% 
  ggplot(aes(Samples, Abundance, fill = Group)) +
  geom_boxplot() +
  geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("log2(Abundance)") +
  ggtitle("Boxplot of compound abundances\nAll samples\nFA-only / Media / Postive Mode")

fa.med.pos.noNA.gathered %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Group)) + 
  geom_density_ridges(scale = 15) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nAll samples\nFA-only / Media / Positive Mode")
Picking joint bandwidth of 1.36

fa.med.pos.noNA.gathered %>% 
  filter(Group == "sample") %>% 
  ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) + 
  geom_density_ridges(scale = 10) +
  theme_ridges() +
  scale_y_discrete(expand = c(0.01, 0)) +
  ggtitle("Ridge plot of compound abundances\nExperimental samples only\nFA-only / Media / Positive Mode")
Picking joint bandwidth of 1.34

fa.med.pos.noNA.gathered %>%
  filter(Group == "sample") %>% 
  ggplot(aes(Abundance, group = Samples, color = Treatment)) +
  geom_density(alpha = 0.8, size = 0.75) +
  xlab("log2(Abundance)") +
  ggtitle("Density plot of compound abundances\nExperimental samples only\nFA-only / Media / Positive Mode")

3.3 Principal Component Analysis

Some plots to understand the relationship between the samples, QC samples, solvent, and empty samples in some cases.

3.3.1 Cells / Negative Mode

### PCA on all Samples ###
fa.cell.neg.full.pca <- fa.cell.neg.noNA %>% 
  select(starts_with("FAnC")) %>% 
  # good idea to center data before pca, but scaling should not be necessary
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
# plot variance explained by each new principal component
plot(
  (fa.cell.neg.full.pca$sdev ^ 2) * 100 / sum(fa.cell.neg.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nFA-only / Cells / Negative Mode",
  type = "b"
  )

fa.cell.neg.full.pca.x <- as.data.frame(fa.cell.neg.full.pca$x)
row.names(fa.cell.neg.full.pca.x) <- fa.cell.neg.noNA$Samples
fa.cell.neg.full.pca.x <- fa.cell.neg.full.pca.x %>% 
  bind_cols(fa.cell.neg.noNA %>% select(Group:Experiment))
fa.cell.neg.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (92.3% Var)") +
  ylab("PC2 (3.7%)") +
  ggtitle("Principal Component Analysis\nAll Samples\nFA-only / Cells / Negative Mode")

### Samples and QC ###
fa.cell.neg.smpl.QC.pca <- fa.cell.neg.noNA %>% 
  filter(Group == "sample" | Group == "QC") %>% 
  select(starts_with("FAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (fa.cell.neg.smpl.QC.pca$sdev ^ 2) * 100 / sum(fa.cell.neg.smpl.QC.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and QC\nFA-only / Cells / Negative Mode",
  type = "b"
  )

fa.cell.neg.smpl.QC.pca.x <- as.data.frame(fa.cell.neg.smpl.QC.pca$x)
fa.cell.neg.smpl.QC.pca.x <- fa.cell.neg.smpl.QC.pca.x %>% 
  bind_cols(
    fa.cell.neg.noNA %>% 
      filter(Group == "sample" | Group == "QC") %>% 
      select(Samples, Group:Experiment)
  )
row.names(fa.cell.neg.smpl.QC.pca.x) <- fa.cell.neg.smpl.QC.pca.x$Samples
fa.cell.neg.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (36.2% Var)") +
  ylab("PC2 (17.0%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nFA-only / Cells / Negative Mode")

fa.cell.neg.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (11.9% Var)") +
  ylab("PC4 (7.9%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nFA-only / Cells / Negative Mode")

### Experimental Samples Only ###
fa.cell.neg.smpl.pca <- fa.cell.neg.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("FAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (fa.cell.neg.smpl.pca$sdev ^ 2) * 100 / sum(fa.cell.neg.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nFA-only / Cells / Negative Mode",
  type = "b"
  )

fa.cell.neg.smpl.pca.x <- as.data.frame(fa.cell.neg.smpl.pca$x)
fa.cell.neg.smpl.pca.x <- fa.cell.neg.smpl.pca.x %>% 
  bind_cols(
    fa.cell.neg.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Experiment)
  )
row.names(fa.cell.neg.smpl.pca.x) <- fa.cell.neg.smpl.pca.x$Samples
fa.cell.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (34.7% Var)") +
  ylab("PC2 (20.3%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nFA-only / Cells / Negative Mode")

fa.cell.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (11.5% Var)") +
  ylab("PC4 (8.6%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nFA-only / Cells / Negative Mode")

3.3.2 Cells / Postive Mode

### PCA on all Samples ###
fa.cell.pos.full.pca <- fa.cell.pos.noNA %>% 
  select(starts_with("FApC")) %>% 
  # good idea to center data before pca, but scaling should not be necessary
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
# plot variance explained by each new principal component
plot(
  (fa.cell.pos.full.pca$sdev ^ 2) * 100 / sum(fa.cell.pos.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nFA-only / Cells / Positive Mode",
  type = "b"
  )

fa.cell.pos.full.pca.x <- as.data.frame(fa.cell.pos.full.pca$x)
row.names(fa.cell.pos.full.pca.x) <- fa.cell.pos.noNA$Samples
fa.cell.pos.full.pca.x <- fa.cell.pos.full.pca.x %>% 
  bind_cols(fa.cell.pos.noNA %>% select(Group:Experiment))
fa.cell.pos.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (89.5% Var)") +
  ylab("PC2 (3.4%)") +
  ggtitle("Principal Component Analysis\nAll Samples\nFA-only / Cells / Positive Mode")

### Samples and QC ###
fa.cell.pos.smpl.QC.pca <- fa.cell.pos.noNA %>% 
  filter(Group == "sample" | Group == "QC") %>% 
  select(starts_with("FApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (fa.cell.pos.smpl.QC.pca$sdev ^ 2) * 100 / sum(fa.cell.pos.smpl.QC.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and QC\nFA-only / Cells / Positive Mode",
  type = "b"
  )

fa.cell.pos.smpl.QC.pca.x <- as.data.frame(fa.cell.pos.smpl.QC.pca$x)
fa.cell.pos.smpl.QC.pca.x <- fa.cell.pos.smpl.QC.pca.x %>% 
  bind_cols(
    fa.cell.pos.noNA %>% 
      filter(Group == "sample" | Group == "QC") %>% 
      select(Samples, Group:Experiment)
  )
row.names(fa.cell.pos.smpl.QC.pca.x) <- fa.cell.pos.smpl.QC.pca.x$Samples
fa.cell.pos.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (48.3% Var)") +
  ylab("PC2 (12.9%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nFA-only / Cells / Positive Mode")

fa.cell.pos.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (12.3% Var)") +
  ylab("PC4 (8.4%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nFA-only / Cells / Positive Mode")

### Experimental Samples Only ###
fa.cell.pos.smpl.pca <- fa.cell.pos.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("FApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (fa.cell.pos.smpl.pca$sdev ^ 2) * 100 / sum(fa.cell.pos.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nFA-only / Cells / Positive Mode",
  type = "b"
  )

fa.cell.pos.smpl.pca.x <- as.data.frame(fa.cell.pos.smpl.pca$x)
fa.cell.pos.smpl.pca.x <- fa.cell.pos.smpl.pca.x %>% 
  bind_cols(
    fa.cell.pos.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Experiment)
  )
row.names(fa.cell.pos.smpl.pca.x) <- fa.cell.pos.smpl.pca.x$Samples
fa.cell.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (51.3% Var)") +
  ylab("PC2 (15.7%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nFA-only / Cells / Positive Mode")

fa.cell.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (11.1% Var)") +
  ylab("PC4 (6.8%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nFA-only / Cells / Positive Mode")

3.3.3 Media / Negative Mode

fa.med.neg.full.pca <- fa.med.neg.noNA %>% 
  select(starts_with("FAnM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (fa.med.neg.full.pca$sdev ^ 2) * 100 / sum(fa.med.neg.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nFA-only / Media / Negative Mode",
  type = "b"
  )

fa.med.neg.full.pca.x <- as.data.frame(fa.med.neg.full.pca$x)
row.names(fa.med.neg.full.pca.x) <- fa.med.neg.noNA$Samples
fa.med.neg.full.pca.x <- fa.med.neg.full.pca.x %>% 
  bind_cols(fa.med.neg.noNA %>% select(Group:Experiment))
fa.med.neg.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (94.0% Var)") +
  ylab("PC2 (2.2%)") +
  ggtitle("Principal Component Analysis\nAll Samples\nFA-only / Media / Negative Mode")

### Samples and QC ###
fa.med.neg.smpl.QC.pca <- fa.med.neg.noNA %>% 
  filter(Group == "sample" | Group == "QC") %>% 
  select(starts_with("FAnM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (fa.med.neg.smpl.QC.pca$sdev ^ 2) * 100 / sum(fa.med.neg.smpl.QC.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and QC\nFA-only / Media / Negative Mode",
  type = "b"
  )

fa.med.neg.smpl.QC.pca.x <- as.data.frame(fa.med.neg.smpl.QC.pca$x)
fa.med.neg.smpl.QC.pca.x <- fa.med.neg.smpl.QC.pca.x %>% 
  bind_cols(
    fa.med.neg.noNA %>% 
      filter(Group == "sample" | Group == "QC") %>% 
      select(Samples, Group:Experiment)
  )
row.names(fa.med.neg.smpl.QC.pca.x) <- fa.med.neg.smpl.QC.pca.x$Samples
fa.med.neg.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (52.3% Var)") +
  ylab("PC2 (25.9%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nFA-only / Media / Negative Mode")

fa.med.neg.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (11.9% Var)") +
  ylab("PC4 (2.8%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nFA-only / Media / Negative Mode")

### Experimental Samples Only ###
fa.med.neg.smpl.pca <- fa.med.neg.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("FAnM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (fa.med.neg.smpl.pca$sdev ^ 2) * 100 / sum(fa.med.neg.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nFA-only / Media / Negative Mode",
  type = "b"
  )

fa.med.neg.smpl.pca.x <- as.data.frame(fa.med.neg.smpl.pca$x)
fa.med.neg.smpl.pca.x <- fa.med.neg.smpl.pca.x %>% 
  bind_cols(
    fa.med.neg.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Experiment)
  )
row.names(fa.med.neg.smpl.pca.x) <- fa.med.neg.smpl.pca.x$Samples
fa.med.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (51.2% Var)") +
  ylab("PC2 (26.2%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nFA-only / Media / Negative Mode")

fa.med.neg.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (13.4% Var)") +
  ylab("PC4 (3.1%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nFA-only / Media / Negative Mode")

3.3.4 Media / Positive Mode

fa.med.pos.full.pca <- fa.med.pos.noNA %>% 
  select(starts_with("FApM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (fa.med.pos.full.pca$sdev ^ 2) * 100 / sum(fa.med.pos.full.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nAll samples only\nFA-only / Media / Positive Mode",
  type = "b"
  )

fa.med.pos.full.pca.x <- as.data.frame(fa.med.pos.full.pca$x)
row.names(fa.med.pos.full.pca.x) <- fa.med.pos.noNA$Samples
fa.med.pos.full.pca.x <- fa.med.pos.full.pca.x %>% 
  bind_cols(fa.med.pos.noNA %>% select(Group:Experiment))
fa.med.pos.full.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (87.9% Var)") +
  ylab("PC2 (6.0%)") +
  ggtitle("Principal Component Analysis\nAll Samples\nFA-only / Media / Positive Mode")

### Samples and QC ###
fa.med.pos.smpl.QC.pca <- fa.med.pos.noNA %>% 
  filter(Group == "sample" | Group == "QC") %>% 
  select(starts_with("FApM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (fa.med.pos.smpl.QC.pca$sdev ^ 2) * 100 / sum(fa.med.pos.smpl.QC.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nSamples and QC\nFA-only / Media / Positive Mode",
  type = "b"
  )

fa.med.pos.smpl.QC.pca.x <- as.data.frame(fa.med.pos.smpl.QC.pca$x)
fa.med.pos.smpl.QC.pca.x <- fa.med.pos.smpl.QC.pca.x %>% 
  bind_cols(
    fa.med.pos.noNA %>% 
      filter(Group == "sample" | Group == "QC") %>% 
      select(Samples, Group:Experiment)
  )
row.names(fa.med.pos.smpl.QC.pca.x) <- fa.med.pos.smpl.QC.pca.x$Samples
fa.med.pos.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (62.02% Var)") +
  ylab("PC2 (11.2%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nFA-only / Media / Positive Mode")

fa.med.pos.smpl.QC.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (8.5% Var)") +
  ylab("PC4 (4.4%)") +
  ggtitle("Principal Component Analysis\nSamples and QC\nFA-only / Media / Positive Mode")

### Experimental Samples Only ###
fa.med.pos.smpl.pca <- fa.med.pos.noNA %>% 
  filter(Group == "sample") %>% 
  select(starts_with("FApM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(
  (fa.med.pos.smpl.pca$sdev ^ 2) * 100 / sum(fa.med.pos.smpl.pca$sdev ^ 2), 
  xlab = "Principal Component",
  ylab = "Variance Explained",
  main = "Percent variance explained by each principal component\nExperimental samples only\nFA-only / Media / Positive Mode",
  type = "b"
  )

fa.med.pos.smpl.pca.x <- as.data.frame(fa.med.pos.smpl.pca$x)
fa.med.pos.smpl.pca.x <- fa.med.pos.smpl.pca.x %>% 
  bind_cols(
    fa.med.pos.noNA %>% 
      filter(Group == "sample") %>% 
      select(Samples, Group:Experiment)
  )
row.names(fa.med.pos.smpl.pca.x) <- fa.med.pos.smpl.pca.x$Samples
fa.med.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (55.3% Var)") +
  ylab("PC2 (14.8%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nFA-only / Media / Positive Mode")

fa.med.pos.smpl.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (10.1% Var)") +
  ylab("PC4 (5.1%)") +
  ggtitle("Principal Component Analysis\nExperimental samples only\nFA-only / Media / Positive Mode")

4 Batch Effects and Signifiance Testing

4.1 Cells / Negative Mode

# sample prep
fa.cell.neg.smpl.data <- fa.cell.neg.noNA %>% 
    filter(Group == "sample")
fa.cell.neg.data.for.sva <- as.matrix(
  fa.cell.neg.smpl.data[, which(
    colnames(fa.cell.neg.smpl.data) == "FAnC1"
    ):ncol(fa.cell.neg.smpl.data)]
  )
row.names(fa.cell.neg.data.for.sva) <- fa.cell.neg.smpl.data$Samples
fa.cell.neg.data.for.sva <- t(fa.cell.neg.data.for.sva)
# pheno prep
fa.cell.neg.data.pheno <- as.data.frame(fa.cell.neg.smpl.data[, 5:7])
row.names(fa.cell.neg.data.pheno) <- fa.cell.neg.smpl.data$Samples
# sva calculation
fa.cell.neg.mod.fa <- model.matrix(~ as.factor(Treatment), data = fa.cell.neg.data.pheno)
fa.cell.neg.mod0 <- model.matrix(~ 1, data = fa.cell.neg.data.pheno)
set.seed(2018)
num.sv(fa.cell.neg.data.for.sva, fa.cell.neg.mod.fa, method = "be")
[1] 4
set.seed(2018)
num.sv(fa.cell.neg.data.for.sva, fa.cell.neg.mod.fa, method = "leek")
[1] 3
set.seed(2018)
fa.cell.neg.sv <- sva(fa.cell.neg.data.for.sva, fa.cell.neg.mod.fa, fa.cell.neg.mod0)
Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  
# extract the surrogate variables
fa.cell.neg.surr.var <- as.data.frame(fa.cell.neg.sv$sv)
colnames(fa.cell.neg.surr.var) <- c("S1", "S2", "S3", "S4")


fa.cell.neg.covar <- fa.cell.neg.smpl.pca.x %>% 
  select(Samples, PC1:PC5) %>% 
  inner_join(
    cbind(fa.cell.neg.data.pheno, fa.cell.neg.surr.var) %>% 
      rownames_to_column("Samples"),
    by = "Samples"
    ) %>% 
  full_join(fa.cell.other.info, by = "Samples") %>% 
  as_tibble()

fa.cell.neg.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples, exp_plate:S4) %>% 
  gather("surr_var", "value", S1:S4) %>% 
  ggplot(aes(exp_plate, value, color = exp_plate)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1) +
  facet_wrap(~ surr_var)

fa.cell.neg.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples, Treatment:S4) %>% 
  gather("surr_var", "value", S1:S4) %>% 
  ggplot(aes(exp_plate, value)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1, aes(color = Treatment)) +
  facet_wrap(~ surr_var)

fa.cell.neg.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples:exp_plate) %>% 
  gather("PC", "value", PC1:PC5) %>% 
  ggplot(aes(exp_plate, value)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1, aes(color = Treatment)) +
  facet_wrap(~ PC, scales = "free")

fa.cell.neg.smpl.pca.x %>% 
  ggplot(aes(PC1, PC2, color = Treatment, shape = Experiment)) + 
  geom_point(size = 4 , alpha = 0.8)

fa.cell.neg.covar %>% 
  select(PC1:PC5, S1:S4) %>% 
  ggcorr(label = TRUE)

fa.cell.neg.covar %>% 
  select(PC1:PC5, S1:S4) %>% 
  ggpairs()

fa.cell.neg.covar %>% 
  select(PC1:PC5, prot_conc_ug_uL:run_order) %>% 
  ggcorr(label = TRUE)

fa.cell.neg.covar %>% 
  select(PC1:PC5, prot_conc_ug_uL:run_order) %>% 
  ggpairs()

fa.cell.neg.covar %>%
  ggplot(aes(run_order, prot_conc_ug_uL, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.cell.neg.covar %>% 
  ggplot(aes(run_order, PC1, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.cell.neg.covar %>% 
  ggplot(aes(prot_conc_ug_uL, PC1, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.cell.neg.covar %>% 
  ggplot(aes(prot_conc_ug_uL, PC5, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.cell.neg.covar %>% 
  select(S1:run_order) %>% 
  ggcorr(label = TRUE)

fa.cell.neg.covar %>% 
  select(S1:run_order) %>% 
  ggpairs()

fa.cell.neg.covar %>% 
  ggplot(aes(run_order, S1, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.cell.neg.covar %>% 
  ggplot(aes(run_order, S2, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.cell.neg.covar %>% 
  ggplot(aes(prot_conc_ug_uL, S1, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.cell.neg.covar %>% 
  ggplot(aes(prot_conc_ug_uL, S2, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

colnames(fa.cell.neg.mod.fa) <- c("cntrl", "FAvsCNTRL")
# combine the full model matrix and the surrogate variables into one
fa.cell.neg.d.sv <- cbind(fa.cell.neg.mod.fa, fa.cell.neg.surr.var[, 1:2])  
head(fa.cell.neg.d.sv)
          cntrl FAvsCNTRL          S1           S2
T1_P1_C1      1         0 -0.34666196 -0.098939211
T1_P1_C2      1         0 -0.25299878  0.153661274
T1_P1_C3      1         0 -0.21635550  0.061003369
T1_P1_FA1     1         1 -0.13405912  0.409568228
T1_P1_FA2     1         1 -0.29004084  0.001577421
T1_P1_FA3     1         1 -0.07911872  0.021463180
fa.cell.neg.top.table <- fa.cell.neg.data.for.sva %>% 
  # fit a linear model 
  lmFit(fa.cell.neg.d.sv) %>% 
  # calculate the test statistics
  eBayes() %>% 
  # select the top features that have a p-value < 0.05 after Bonferroni multiple hypothesis correction
  topTable(coef = "FAvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(fa.cell.neg.data.for.sva))
# what the result looks like:
head(fa.cell.neg.top.table)  
             logFC  AveExpr         t      P.Value    adj.P.Val        B
FAnC186  1.2702923 11.14052  8.614618 1.361803e-08 2.778079e-06 9.553378
FAnC184  1.5893545 10.89884  8.435652 1.964230e-08 4.007030e-06 9.180335
FAnC182  1.0730828 14.18294  6.050035 3.878567e-06 7.912277e-04 3.810472
FAnC17  -0.4413784 20.59570 -5.048885 4.343561e-05 8.860864e-03 1.375521
# make result more informative
fa.cell.neg.top.w.info <- fa.cell.neg.top.table %>% 
  rownames_to_column("compound_short") %>% 
  mutate(
    fa_div_cntrl = 2 ^ logFC,
    change_in_fa = ifelse(fa_div_cntrl < 1, "down", "up")
    ) %>% 
  inner_join(fa.cell.neg.compound.info, by = "compound_short") %>% 
  filter(fa_div_cntrl > 1.2 | fa_div_cntrl < 0.83) %>%  
  arrange(change_in_fa, desc(fa_div_cntrl))
fa.cell.neg.top.w.info %>% 
  select(compound_short, compound_full, change_in_fa, fa_div_cntrl)
  compound_short compound_full change_in_fa fa_div_cntrl
1         FAnC17        Serine         down    0.7364307
2        FAnC184  5-Methyl-DHF           up    3.0091468
3        FAnC186 10-Formyl-DHF           up    2.4121043
4        FAnC182    Folic Acid           up    2.1039243
#write_csv(path = "./results/fa_cell_neg_top_hits_w_FC_pval_sv_edit.csv", fa.cell.neg.top.w.info)
fa.cell.neg.gathered <- fa.cell.neg.noNA %>%
  filter(Group == "sample") %>% 
  bind_cols(fa.cell.neg.surr.var) %>% 
  select(Samples, Treatment, S1:S2, starts_with("FAnC")) %>% 
  gather(key = "Compound", value = "Abundance", FAnC1:FAnC99)
# structure so far:
glimpse(fa.cell.neg.gathered)
Observations: 4,896
Variables: 6
$ Samples   <chr> "T1_P1_C1", "T1_P1_C2", "T1_P1_C3", "T1_P1_FA1", "T1...
$ Treatment <chr> "cntrl", "cntrl", "cntrl", "fa", "fa", "fa", "cntrl"...
$ S1        <dbl> -0.34666196, -0.25299878, -0.21635550, -0.13405912, ...
$ S2        <dbl> -0.098939211, 0.153661274, 0.061003369, 0.409568228,...
$ Compound  <chr> "FAnC1", "FAnC1", "FAnC1", "FAnC1", "FAnC1", "FAnC1"...
$ Abundance <dbl> 17.13956, 17.03877, 16.57158, 17.49617, 16.35237, 16...
fa.cell.neg.nested <- fa.cell.neg.gathered %>% 
  group_by(Compound) %>% 
  nest() %>% 
  # apply a linear model to each individual compound, as a function of the surrogate variables
  mutate(model = map(data, ~lm(Abundance ~ S1 + S2, data = .))) %>% 
  # use broom to tidy up the output
  mutate(augment_model = map(model, augment))
# result to far:
fa.cell.neg.nested
# A tibble: 204 x 4
   Compound data              model    augment_model     
   <chr>    <list>            <list>   <list>            
 1 FAnC1    <tibble [24 x 5]> <S3: lm> <tibble [24 x 10]>
 2 FAnC10   <tibble [24 x 5]> <S3: lm> <tibble [24 x 10]>
 3 FAnC100  <tibble [24 x 5]> <S3: lm> <tibble [24 x 10]>
 4 FAnC101  <tibble [24 x 5]> <S3: lm> <tibble [24 x 10]>
 5 FAnC102  <tibble [24 x 5]> <S3: lm> <tibble [24 x 10]>
 6 FAnC103  <tibble [24 x 5]> <S3: lm> <tibble [24 x 10]>
 7 FAnC104  <tibble [24 x 5]> <S3: lm> <tibble [24 x 10]>
 8 FAnC105  <tibble [24 x 5]> <S3: lm> <tibble [24 x 10]>
 9 FAnC106  <tibble [24 x 5]> <S3: lm> <tibble [24 x 10]>
10 FAnC107  <tibble [24 x 5]> <S3: lm> <tibble [24 x 10]>
# ... with 194 more rows
# now to get the residuals out for each compound
fa.cell.neg.modSV.resid <- fa.cell.neg.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, Treatment, Compound, .resid) %>% 
  # return to long format
  spread(Compound, .resid) 
glimpse(fa.cell.neg.modSV.resid[, 1:5])
Observations: 24
Variables: 5
$ Samples   <chr> "T1_P1_C1", "T1_P1_C2", "T1_P1_C3", "T1_P1_FA1", "T1...
$ Treatment <chr> "cntrl", "cntrl", "cntrl", "fa", "fa", "fa", "cntrl"...
$ FAnC1     <dbl> -0.02968854, 0.01128112, -0.35142304, 0.66530704, -0...
$ FAnC10    <dbl> -0.1426195303, 0.0274206987, -0.4867439882, 0.773244...
$ FAnC100   <dbl> -0.0505930542, 0.0283496690, -0.0129931473, -0.08995...
fa.cell.neg.modSV.resid %>% 
  select(Samples, one_of(fa.cell.neg.top.w.info$compound_short)) %>% 
  HeatmapPrepAlt("FAnC") %>% 
  t() %>% 
  heatmaply(
    colors = viridis(n = 10, option = "magma"), 
    xlab = "Samples", ylab = "Compounds",
    main = "Statistically significant compounds\nFA-Only Experiment / Cells / Negative Mode",
    margins = c(50, 50, 75, 30),
    labRow = fa.cell.neg.top.w.info$compound_full,
    k_col = 2, k_row = 2
    )
### PCA - cleaned data ###
fa.cell.neg.modSV.pca <- fa.cell.neg.modSV.resid %>% 
  select(starts_with("FAnC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(fa.cell.neg.modSV.pca$sdev^ 2 * 100 / sum(fa.cell.neg.modSV.pca$sdev ^ 2))

fa.cell.neg.modSV.pca.x <- as.data.frame(fa.cell.neg.modSV.pca$x)
row.names(fa.cell.neg.modSV.pca.x) <- fa.cell.neg.modSV.resid$Samples
fa.cell.neg.modSV.pca.x <- fa.cell.neg.modSV.pca.x %>% 
  bind_cols(fa.cell.neg.modSV.resid %>% select(Treatment))
fa.cell.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (25.5% Var)") +
  ylab("PC2 (19.0% Var)")

fa.cell.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (13.2% Var)") +
  ylab("PC4 (9.5% Var)")

fa.cell.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC5, y = PC6, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC5 (8.8% Var)") +
  ylab("PC6 (5.9% Var)")

4.2 Cells / Positive Mode

fa.cell.pos.smpl.data <- fa.cell.pos.noNA %>% 
    filter(Group == "sample")
fa.cell.pos.data.for.sva <- as.matrix(
  fa.cell.pos.smpl.data[, which(
    colnames(fa.cell.pos.smpl.data) == "FApC1"
    ):ncol(fa.cell.pos.smpl.data)]
  )
row.names(fa.cell.pos.data.for.sva) <- fa.cell.pos.smpl.data$Samples
fa.cell.pos.data.for.sva <- t(fa.cell.pos.data.for.sva)
fa.cell.pos.data.pheno <- as.data.frame(fa.cell.pos.smpl.data[, 5:7])
row.names(fa.cell.pos.data.pheno) <- fa.cell.pos.smpl.data$Samples
fa.cell.pos.mod.fa <- model.matrix(~ as.factor(Treatment), data = fa.cell.pos.data.pheno)
fa.cell.pos.mod0 <- model.matrix(~ 1, data = fa.cell.pos.data.pheno)
set.seed(2018)
num.sv(fa.cell.pos.data.for.sva, fa.cell.pos.mod.fa, method = "be")
[1] 3
set.seed(2018)
num.sv(fa.cell.pos.data.for.sva, fa.cell.pos.mod.fa, method = "leek")
[1] 0
set.seed(2018)
fa.cell.pos.sv <- sva(fa.cell.pos.data.for.sva, fa.cell.pos.mod.fa, fa.cell.pos.mod0)
Number of significant surrogate variables is:  3 
Iteration (out of 5 ):1  2  3  4  5  
fa.cell.pos.surr.var <- as.data.frame(fa.cell.pos.sv$sv)
colnames(fa.cell.pos.surr.var) <- c("S1", "S2", "S3")

fa.cell.pos.covar <- fa.cell.pos.smpl.pca.x %>% 
  select(Samples, PC1:PC5) %>% 
  inner_join(
    cbind(fa.cell.pos.data.pheno, fa.cell.pos.surr.var) %>% 
      rownames_to_column("Samples"),
    by = "Samples"
    ) %>% 
  full_join(fa.cell.other.info, by = "Samples") %>% 
  as_tibble()

fa.cell.pos.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples, exp_plate:S3) %>% 
  gather("surr_var", "value", S1:S3) %>% 
  ggplot(aes(exp_plate, value, color = exp_plate)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1) +
  facet_wrap(~ surr_var, scales = "free")

fa.cell.pos.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples, Treatment:S3) %>% 
  gather("surr_var", "value", S1:S3) %>% 
  ggplot(aes(exp_plate, value)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1, aes(color = Treatment)) +
  facet_wrap(~ surr_var, scales = "free")

fa.cell.pos.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples:exp_plate) %>% 
  gather("PC", "value", PC1:PC5) %>% 
  ggplot(aes(exp_plate, value)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1, aes(color = Treatment)) +
  facet_wrap(~ PC, scales = "free")

fa.cell.pos.smpl.pca.x %>% 
  ggplot(aes(PC1, PC2, color = Treatment, shape = Experiment)) + 
  geom_point(size = 4 , alpha = 0.8)

fa.cell.pos.smpl.pca.x %>% 
  ggplot(aes(PC1, PC2, color = Treatment, shape = Plate)) + 
  geom_point(size = 4 , alpha = 0.8)

fa.cell.pos.covar %>% 
  select(PC1:PC5, S1:S3) %>% 
  ggcorr(label = TRUE)

fa.cell.pos.covar %>% 
  select(PC1:PC5, S1:S3) %>% 
  ggpairs()

fa.cell.pos.covar %>% 
  select(PC1:PC5, prot_conc_ug_uL:run_order) %>% 
  ggcorr(label = TRUE)

fa.cell.pos.covar %>% 
  select(PC1:PC5, prot_conc_ug_uL:run_order) %>% 
  ggpairs()

fa.cell.pos.covar %>%
  ggplot(aes(run_order, prot_conc_ug_uL, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.cell.pos.covar %>% 
  ggplot(aes(run_order, PC1, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.cell.pos.covar %>% 
  ggplot(aes(prot_conc_ug_uL, PC1, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.cell.pos.covar %>% 
  ggplot(aes(prot_conc_ug_uL, PC3, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.cell.pos.covar %>% 
  select(S1:run_order) %>% 
  ggcorr(label = TRUE)

fa.cell.pos.covar %>% 
  select(S1:run_order) %>% 
  ggpairs()

fa.cell.pos.covar %>% 
  ggplot(aes(run_order, S1, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.cell.pos.covar %>% 
  ggplot(aes(run_order, S2, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.cell.pos.covar %>% 
  ggplot(aes(prot_conc_ug_uL, S1, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.cell.pos.covar %>% 
  ggplot(aes(prot_conc_ug_uL, S3, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

colnames(fa.cell.pos.mod.fa) <- c("cntrl", "FAvsCNTRL")
fa.cell.pos.d.sv <- cbind(fa.cell.pos.mod.fa, fa.cell.pos.surr.var[, 1:2])  
head(fa.cell.pos.d.sv)
          cntrl FAvsCNTRL          S1          S2
T1_P1_C1      1         0 -0.31421328 -0.01485533
T1_P1_C2      1         0 -0.25463211 -0.16793123
T1_P1_C3      1         0 -0.18110404 -0.07376907
T1_P1_FA1     1         1 -0.21726705 -0.01420531
T1_P1_FA2     1         1 -0.25160936 -0.10623941
T1_P1_FA3     1         1 -0.09146465 -0.05419054
fa.cell.pos.top.table <- fa.cell.pos.data.for.sva %>% 
  lmFit(fa.cell.pos.d.sv) %>% 
  eBayes() %>% 
  topTable(coef = "FAvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(fa.cell.pos.data.for.sva))
fa.cell.pos.top.w.info <- fa.cell.pos.top.table %>% 
  rownames_to_column("compound_short") %>% 
  mutate(
    fa_div_cntrl = 2 ^ logFC,
    change_in_fa = ifelse(fa_div_cntrl < 1, "down", "up")
    ) %>% 
  inner_join(fa.cell.pos.compound.info, by = "compound_short") %>% 
  filter(fa_div_cntrl > 1.2 | fa_div_cntrl < 0.83) %>%  
  arrange(change_in_fa, desc(fa_div_cntrl))
fa.cell.pos.top.w.info %>% 
  select(compound_short, compound_full, change_in_fa, fa_div_cntrl)
  compound_short compound_full change_in_fa fa_div_cntrl
1          FApC9        Serine         down    0.7142689
2        FApC124  5-Methyl-THF           up    1.6002449
#write_csv(path = "./results/fa_cell_pos_top_hits_w_FC_pval_sv_edit.csv", fa.cell.pos.top.w.info)
fa.cell.pos.gathered <- fa.cell.pos.noNA %>%
  filter(Group == "sample") %>% 
  bind_cols(fa.cell.pos.surr.var) %>% 
  select(Samples, Treatment, S1:S2, starts_with("FApC")) %>% 
  gather(key = "Compound", value = "Abundance", FApC1:FApC99)
fa.cell.pos.nested <- fa.cell.pos.gathered %>% 
  group_by(Compound) %>% 
  nest() %>% 
  mutate(model = map(data, ~lm(Abundance ~ S1 + S2, data = .))) %>% 
  mutate(augment_model = map(model, augment))
fa.cell.pos.modSV.resid <- fa.cell.pos.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, Treatment, Compound, .resid) %>% 
  spread(Compound, .resid) 
fa.cell.pos.modSV.resid %>% 
  select(Samples, one_of(fa.cell.pos.top.w.info$compound_short)) %>% 
  HeatmapPrepAlt("FApC") %>% 
  t() %>% 
  heatmaply(
    colors = viridis(n = 10, option = "magma"), 
    xlab = "Samples", ylab = "Compounds",
    main = "Statistically significant compounds\nFA-Only Experiment / Cells / Positive Mode",
    margins = c(50, 50, 75, 30),
    labRow = fa.cell.pos.top.w.info$compound_full,
    k_col = 2, k_row = 2
    )
### PCA - cleaned data ###
fa.cell.pos.modSV.pca <- fa.cell.pos.modSV.resid %>% 
  select(starts_with("FApC")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(fa.cell.pos.modSV.pca$sdev^ 2 * 100 / sum(fa.cell.pos.modSV.pca$sdev ^ 2))

fa.cell.pos.modSV.pca.x <- as.data.frame(fa.cell.pos.modSV.pca$x)
row.names(fa.cell.pos.modSV.pca.x) <- fa.cell.pos.modSV.resid$Samples
fa.cell.pos.modSV.pca.x <- fa.cell.pos.modSV.pca.x %>% 
  bind_cols(fa.cell.pos.modSV.resid %>% select(Treatment))
fa.cell.pos.modSV.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (33.7% Var)") +
  ylab("PC2 (20.7% Var)")

fa.cell.pos.modSV.pca.x %>% 
  ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC3 (13.8% Var)") +
  ylab("PC4 (8.1% Var)")

fa.cell.pos.modSV.pca.x %>% 
  ggplot(aes(x = PC5, y = PC6, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC5 (5.1% Var)") +
  ylab("PC6 (4.4% Var)")

4.3 Media / Negative Mode

fa.med.neg.smpl.data <- fa.med.neg.noNA %>% 
    filter(Group == "sample")
fa.med.neg.data.for.sva <- as.matrix(
  fa.med.neg.smpl.data[, which(
    colnames(fa.med.neg.smpl.data) == "FAnM1"
    ):ncol(fa.med.neg.smpl.data)]
  )
row.names(fa.med.neg.data.for.sva) <- fa.med.neg.smpl.data$Samples
fa.med.neg.data.for.sva <- t(fa.med.neg.data.for.sva)
fa.med.neg.data.pheno <- as.data.frame(fa.med.neg.smpl.data[, 5:7])
row.names(fa.med.neg.data.pheno) <- fa.med.neg.smpl.data$Samples
fa.med.neg.mod.fa <- model.matrix(~ as.factor(Treatment), data = fa.med.neg.data.pheno)
fa.med.neg.mod0 <- model.matrix(~ 1, data = fa.med.neg.data.pheno)
set.seed(2018)
num.sv(fa.med.neg.data.for.sva, fa.med.neg.mod.fa, method = "be")
[1] 2
set.seed(2018)
num.sv(fa.med.neg.data.for.sva, fa.med.neg.mod.fa, method = "leek")
[1] 0
set.seed(2018)
fa.med.neg.sv <- sva(fa.med.neg.data.for.sva, fa.med.neg.mod.fa, fa.med.neg.mod0)
Number of significant surrogate variables is:  2 
Iteration (out of 5 ):1  2  3  4  5  
fa.med.neg.surr.var <- as.data.frame(fa.med.neg.sv$sv)
colnames(fa.med.neg.surr.var) <- c("S1", "S2")

fa.med.neg.covar <- fa.med.neg.smpl.pca.x %>% 
  select(Samples, PC1:PC3) %>% 
  inner_join(
    cbind(fa.med.neg.data.pheno, fa.med.neg.surr.var) %>% 
      rownames_to_column("Samples"),
    by = "Samples"
    ) %>% 
  full_join(fa.cell.other.info, by = "Samples") %>% 
  as_tibble()

fa.med.neg.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples, exp_plate:S2) %>% 
  gather("surr_var", "value", S1:S2) %>% 
  ggplot(aes(exp_plate, value, color = exp_plate)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1) +
  facet_wrap(~ surr_var, scales = "free")

fa.med.neg.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples, Treatment:S2) %>% 
  gather("surr_var", "value", S1:S2) %>% 
  ggplot(aes(exp_plate, value)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1, aes(color = Treatment)) +
  facet_wrap(~ surr_var, scales = "free")

fa.med.neg.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples:exp_plate) %>% 
  gather("PC", "value", PC1:PC3) %>% 
  ggplot(aes(exp_plate, value)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1, aes(color = Treatment)) +
  facet_wrap(~ PC, scales = "free")

fa.med.neg.covar %>% 
  select(PC1:PC3, S1:S2) %>% 
  ggcorr(label = TRUE)

fa.med.neg.covar %>% 
  select(PC1:PC3, S1:S2) %>% 
  ggpairs()

fa.med.neg.covar %>% 
  select(PC1:PC3, prot_conc_ug_uL:run_order) %>% 
  ggcorr(label = TRUE)

fa.med.neg.covar %>% 
  select(PC1:PC3, prot_conc_ug_uL:run_order) %>% 
  ggpairs()

fa.med.neg.covar %>% 
  ggplot(aes(run_order, PC1, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.med.neg.covar %>% 
  ggplot(aes(run_order, PC3, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.med.neg.covar %>% 
  ggplot(aes(prot_conc_ug_uL, PC1, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.med.neg.covar %>% 
  ggplot(aes(prot_conc_ug_uL, PC2, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.med.neg.covar %>% 
  ggplot(aes(prot_conc_ug_uL, PC3, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.med.neg.covar %>% 
  select(S1:run_order) %>% 
  ggcorr(label = TRUE)

fa.med.neg.covar %>% 
  select(S1:run_order) %>% 
  ggpairs()

fa.med.neg.covar %>% 
  ggplot(aes(run_order, S1, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.med.neg.covar %>% 
  ggplot(aes(run_order, S2, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.med.neg.covar %>% 
  ggplot(aes(prot_conc_ug_uL, S1, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.med.neg.covar %>% 
  ggplot(aes(prot_conc_ug_uL, S2, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

colnames(fa.med.neg.mod.fa) <- c("cntrl", "FAvsCNTRL")
fa.med.neg.d.sv <- cbind(fa.med.neg.mod.fa, fa.med.neg.surr.var)  
head(fa.med.neg.d.sv)
          cntrl FAvsCNTRL          S1         S2
T1_P1_C1      1         0 -0.09995191 -0.1919621
T1_P1_C2      1         0 -0.15623253 -0.1795639
T1_P1_C3      1         0 -0.27423439  0.2990248
T1_P1_FA1     1         1 -0.12610239 -0.1220748
T1_P1_FA2     1         1 -0.12812393 -0.2029587
T1_P1_FA3     1         1 -0.21880953 -0.1672745
fa.med.neg.top.table <- fa.med.neg.data.for.sva %>% 
  lmFit(fa.med.neg.d.sv) %>% 
  eBayes() %>% 
  topTable(coef = "FAvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(fa.med.neg.data.for.sva))
fa.med.neg.top.w.info <- fa.med.neg.top.table %>% 
  rownames_to_column("compound_short") %>% 
  mutate(
    fa_div_cntrl = 2 ^ logFC,
    change_in_fa = ifelse(fa_div_cntrl < 1, "down", "up")
    ) %>% 
  inner_join(fa.med.neg.compound.info, by = "compound_short") %>% 
  filter(fa_div_cntrl > 1.2 | fa_div_cntrl < 0.83) %>%  
  arrange(change_in_fa, desc(fa_div_cntrl))
fa.med.neg.top.w.info %>% 
  select(compound_short, compound_full, change_in_fa, fa_div_cntrl)
  compound_short     compound_full change_in_fa fa_div_cntrl
1         FAnM61 Dihydrofolic Acid           up     6.618852
2         FAnM60        Folic Acid           up     4.492412
#write_csv(path = "./results/fa_med_neg_top_hits_w_FC_pval_sv_edit.csv", fa.med.neg.top.w.info)
fa.med.neg.gathered <- fa.med.neg.noNA %>%
  filter(Group == "sample") %>% 
  bind_cols(fa.med.neg.surr.var) %>% 
  select(Samples, Treatment, S1:S2, starts_with("FAnM")) %>% 
  gather(key = "Compound", value = "Abundance", FAnM1:FAnM8)
fa.med.neg.nested <- fa.med.neg.gathered %>% 
  group_by(Compound) %>% 
  nest() %>% 
  mutate(model = map(data, ~lm(Abundance ~ S1 + S2, data = .))) %>% 
  mutate(augment_model = map(model, augment))
fa.med.neg.modSV.resid <- fa.med.neg.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, Treatment, Compound, .resid) %>% 
  spread(Compound, .resid) 
glimpse(fa.med.neg.modSV.resid[, 1:5])
Observations: 24
Variables: 5
$ Samples   <chr> "T1_P1_C1", "T1_P1_C2", "T1_P1_C3", "T1_P1_FA1", "T1...
$ Treatment <chr> "cntrl", "cntrl", "cntrl", "fa", "fa", "fa", "cntrl"...
$ FAnM1     <dbl> 0.0084053692, -0.0257309954, -0.0513950496, 0.006788...
$ FAnM10    <dbl> -0.002899808, 0.008644288, -0.039545617, -0.01113359...
$ FAnM12    <dbl> -0.027355739, 0.108203107, 0.133830462, -0.040861511...
fa.med.neg.modSV.resid %>% 
  select(Samples, one_of(fa.med.neg.top.w.info$compound_short)) %>% 
  HeatmapPrepAlt("FAnM") %>% 
  t() %>% 
  heatmaply(
    colors = viridis(n = 10, option = "magma"), 
    xlab = "Samples", ylab = "Compounds",
    main = "Statistically significant compounds\nFA-Only Experiment / Media / Negative Mode",
    margins = c(50, 50, 75, 30),
    labRow = fa.med.neg.top.w.info$compound_full,
    k_col = 2, k_row = 2
    )
### PCA - cleaned data ###
fa.med.neg.modSV.pca <- fa.med.neg.modSV.resid %>% 
  select(starts_with("FAnM")) %>% 
  mutate_all(scale, center = TRUE, scale = FALSE) %>% 
  as.matrix() %>% 
  prcomp()
plot(fa.med.neg.modSV.pca$sdev^ 2 * 100 / sum(fa.med.neg.modSV.pca$sdev ^ 2))

fa.med.neg.modSV.pca.x <- as.data.frame(fa.med.neg.modSV.pca$x)
row.names(fa.med.neg.modSV.pca.x) <- fa.med.neg.modSV.resid$Samples
fa.med.neg.modSV.pca.x <- fa.med.neg.modSV.pca.x %>% 
  bind_cols(fa.med.neg.modSV.resid %>% select(Treatment))
fa.med.neg.modSV.pca.x %>% 
  ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
  geom_point(size = 4, alpha = 0.8) +
  xlab("PC1 (63.4% Var)") +
  ylab("PC2 (27.6% Var)")

4.4 Media / Positive Mode

fa.med.pos.smpl.data <- fa.med.pos.noNA %>% 
    filter(Group == "sample")
fa.med.pos.data.for.sva <- as.matrix(
  fa.med.pos.smpl.data[, which(
    colnames(fa.med.pos.smpl.data) == "FApM1"
    ):ncol(fa.med.pos.smpl.data)]
  )
row.names(fa.med.pos.data.for.sva) <- fa.med.pos.smpl.data$Samples
fa.med.pos.data.for.sva <- t(fa.med.pos.data.for.sva)
fa.med.pos.data.pheno <- as.data.frame(fa.med.pos.smpl.data[, 5:7])
row.names(fa.med.pos.data.pheno) <- fa.med.pos.smpl.data$Samples
fa.med.pos.mod.fa <- model.matrix(~ as.factor(Treatment), data = fa.med.pos.data.pheno)
fa.med.pos.mod0 <- model.matrix(~ 1, data = fa.med.pos.data.pheno)
set.seed(2018)
num.sv(fa.med.pos.data.for.sva, fa.med.pos.mod.fa, method = "be")
[1] 1
set.seed(2018)
num.sv(fa.med.pos.data.for.sva, fa.med.pos.mod.fa, method = "leek")
[1] 0
set.seed(2018)
fa.med.pos.sv <- sva(fa.med.pos.data.for.sva, fa.med.pos.mod.fa, fa.med.pos.mod0)
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  
fa.med.pos.surr.var <- as.data.frame(fa.med.pos.sv$sv)
colnames(fa.med.pos.surr.var) <- c("S1")


fa.med.pos.covar <- fa.med.pos.smpl.pca.x %>% 
  select(Samples, PC1:PC3) %>% 
  inner_join(
    cbind(fa.med.pos.data.pheno, fa.med.pos.surr.var) %>% 
      rownames_to_column("Samples"),
    by = "Samples"
    ) %>% 
  full_join(fa.cell.other.info, by = "Samples") %>% 
  as_tibble()

fa.med.pos.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples, exp_plate:S1) %>% 
  gather("surr_var", "value", S1) %>% 
  ggplot(aes(exp_plate, value, color = exp_plate)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1)

fa.med.pos.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples, Treatment:S1) %>% 
  gather("surr_var", "value", S1:S1) %>% 
  ggplot(aes(exp_plate, value)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1, aes(color = Treatment)) 

fa.med.pos.covar %>% 
  unite("exp_plate", Experiment, Plate, sep = "_") %>% 
  select(Samples:exp_plate) %>% 
  gather("PC", "value", PC1:PC3) %>% 
  ggplot(aes(exp_plate, value)) +
  geom_boxplot() +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1, aes(color = Treatment)) +
  facet_wrap(~ PC, scales = "free")

fa.med.pos.covar %>% 
  select(PC1:PC3, S1) %>% 
  ggcorr(label = TRUE)

fa.med.pos.covar %>% 
  select(PC1:PC3, S1) %>% 
  ggpairs()

fa.med.pos.covar %>% 
  select(PC1:PC3, prot_conc_ug_uL:run_order) %>% 
  ggcorr(label = TRUE)

fa.med.pos.covar %>% 
  select(PC1:PC3, prot_conc_ug_uL:run_order) %>% 
  ggpairs()

fa.med.pos.covar %>% 
  ggplot(aes(run_order, PC1, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.med.pos.covar %>% 
  ggplot(aes(prot_conc_ug_uL, PC1, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.med.pos.covar %>% 
  ggplot(aes(prot_conc_ug_uL, PC2, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.med.pos.covar %>% 
  ggplot(aes(prot_conc_ug_uL, PC3, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.med.pos.covar %>% 
  select(S1:run_order) %>% 
  ggcorr(label = TRUE)

fa.med.pos.covar %>% 
  select(S1:run_order) %>% 
  ggpairs()

fa.med.pos.covar %>% 
  ggplot(aes(run_order, S1, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

fa.med.pos.covar %>% 
  ggplot(aes(prot_conc_ug_uL, S1, color = Treatment, shape = Experiment)) +
  geom_point(size = 4, alpha = 0.8)

colnames(fa.med.pos.mod.fa) <- c("cntrl", "FAvsCNTRL")
fa.med.pos.d.sv <- cbind(fa.med.pos.mod.fa, fa.med.pos.surr.var)  
head(fa.med.pos.d.sv)
          cntrl FAvsCNTRL          S1
T1_P1_C1      1         0 -0.17743847
T1_P1_C2      1         0 -0.20473498
T1_P1_C3      1         0 -0.03404904
T1_P1_FA1     1         1 -0.20311683
T1_P1_FA2     1         1 -0.14711451
T1_P1_FA3     1         1 -0.16289143
fa.med.pos.top.table <- fa.med.pos.data.for.sva %>% 
  lmFit(fa.med.pos.d.sv) %>% 
  eBayes() %>% 
  topTable(coef = "FAvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(fa.med.pos.data.for.sva))
fa.med.pos.top.w.info <- fa.med.pos.top.table %>% 
  rownames_to_column("compound_short") %>% 
  mutate(
    fa_div_cntrl = 2 ^ logFC,
    change_in_fa = ifelse(fa_div_cntrl < 1, "down", "up")
    ) %>% 
  inner_join(fa.med.pos.compound.info, by = "compound_short") %>% 
  filter(fa_div_cntrl > 1.2 | fa_div_cntrl < 0.83) %>%  
  arrange(change_in_fa, desc(fa_div_cntrl))
fa.med.pos.top.w.info %>% 
  select(compound_short, compound_full, change_in_fa, fa_div_cntrl)
  compound_short compound_full change_in_fa fa_div_cntrl
1         FApM59    Folic Acid           up     4.083554
#write_csv(path = "./results/fa_med_pos_top_hits_w_FC_pval_sv_edit.csv", fa.med.pos.top.w.info)
fa.med.pos.gathered <- fa.med.pos.noNA %>%
  filter(Group == "sample") %>% 
  bind_cols(fa.med.pos.surr.var) %>% 
  select(Samples, Treatment, S1, starts_with("FApM")) %>% 
  gather(key = "Compound", value = "Abundance", FApM1:FApM9)
fa.med.pos.nested <- fa.med.pos.gathered %>% 
  group_by(Compound) %>% 
  nest() %>% 
  mutate(model = map(data, ~lm(Abundance ~ S1, data = .))) %>% 
  mutate(augment_model = map(model, augment))
fa.med.pos.modSV.resid <- fa.med.pos.nested %>% 
  unnest(data, augment_model) %>% 
  select(Samples, Treatment, Compound, .resid) %>% 
  spread(Compound, .resid) 
# only one compound was statistically significant = just plot folic acid
fa.med.pos.modSV.resid %>% 
  select(Samples:Treatment, one_of(fa.med.pos.top.w.info$compound_short)) %>% 
  ggplot(aes(Treatment, FApM59, color = Treatment)) +
  geom_boxplot() +
  geom_jitter(size = 2) +
  ylab("Folic Acid")

5 Pathway Analysis

fa.full.hit.list <- fa.cell.neg.top.w.info %>% 
  mutate(Mode = "cell.neg") %>% 
  bind_rows(
    fa.cell.pos.top.w.info %>% 
      mutate(Mode = "cell.pos")
  ) %>% 
  bind_rows(
    fa.med.neg.top.w.info %>% 
      mutate(Mode = "med.neg")
  ) %>% 
  bind_rows(
    fa.med.pos.top.w.info %>% 
      mutate(Mode = "med.pos")
  ) %>% 
  as.tibble()
fa.full.hit.list %>% 
  select(compound_full:formula, cas_id) %>% 
  distinct() %>% 
  arrange(compound_full)
# A tibble: 6 x 3
  compound_full     formula       cas_id    
  <chr>             <chr>         <chr>     
1 10-Formyl-DHF     C20 H21 N7 O7 28459-40-7
2 5-Methyl-DHF      C20 H23 N7 O6 59904-24-4
3 5-Methyl-THF      C20 H25 N7 O6 134-35-0  
4 Dihydrofolic Acid C19 H21 N7 O6 4033-27-6 
5 Folic Acid        C19 H19 N7 O6 59-30-3   
6 Serine            C3 H7 N O3    56-45-1   
fa.sml.hit.list <- fa.full.hit.list %>% 
  select(compound_short, compound_full, Mode, cas_id)
#write_csv(path = "./results/fa_hit_assignment.csv", fa.sml.hit.list)
fa.plot.order <- read_csv("./data/pathways/fa_hit_assignment.csv") 
Parsed with column specification:
cols(
  compound_short = col_character(),
  compound_full = col_character(),
  Mode = col_character(),
  cas_id = col_character(),
  Source = col_character(),
  Plot_Name = col_character()
)
### Cell Hits Plot ###
fa.cell.hits <- fa.plot.order %>% 
  filter(Source == "C") %>%
  mutate(plot_order = factor(Plot_Name, levels = Plot_Name))
fa.cell.data <- fa.cell.neg.modSV.resid %>% 
  inner_join(fa.cell.pos.modSV.resid %>% select(-Treatment), by = "Samples") %>% 
  mutate_at(vars(matches("FA")), scale) %>% 
  gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>% 
  inner_join(fa.cell.hits, by = "compound_short")
fa.cell.sig.plot <- fa.cell.data %>%
  ggplot(aes(plot_order, Abundance, color = Treatment)) +
  geom_boxplot(position = position_dodge(0.8)) +
  geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    panel.background = element_rect(fill = "gray90")
    ) +
  xlab("") +
  ylab("") +
  scale_color_manual(values = c("#56B4E9", "#0072B2"), labels = c("Control", "FA")) +
  ylim(-2.5, 2.5)
fa.cell.sig.plot 

### Media Hits Plot ###
fa.med.hits <- fa.plot.order %>% 
  filter(Source == "M") %>% 
  mutate(plot_order = factor(Plot_Name, levels = Plot_Name))
fa.med.data <- fa.med.neg.modSV.resid %>% 
  inner_join(fa.med.pos.modSV.resid %>% select(-Treatment), by = "Samples") %>% 
  mutate_at(vars(matches("FA")), scale) %>% 
  gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>% 
  inner_join(fa.med.hits, by = "compound_short")
fa.med.sig.plot <- fa.med.data %>%
  ggplot(aes(plot_order, Abundance, color = Treatment)) +
  geom_boxplot(position = position_dodge(0.8)) +
  geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    panel.background = element_rect(fill = "gray90"),
    legend.justification = "top"
    ) +
  xlab("") +
  ylab("") +
  scale_color_manual(values = c("#56B4E9", "#0072B2"), labels = c("Control", "FA")) +
  ylim(-2.5, 2.5)
fa.med.sig.plot 

### all together
fa.legend <- get_legend(fa.med.sig.plot)

fa.sig.plot.grid <- plot_grid(
  fa.cell.sig.plot  + theme(legend.position = "none", plot.margin = unit(c(0,6,0,0), "pt")), 
  plot_grid(
    fa.med.sig.plot + theme(legend.position = "none", plot.margin = unit(c(0,6,0,0), "pt")),
    fa.legend,
    rel_widths = c(1.1, 0.9)
    ),
  ncol = 1, labels = c("A", "B"),
  rel_heights = c(1, 1)
  )
fa.sig.plot.grid

#save_plot("./results/fa_only_exp_sig.png", fa.sig.plot.grid, base_width = 8, base_height = 8)

6 Session Info

sessionInfo()
R version 3.5.1 (2018-07-02)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

Matrix products: default

locale:
[1] LC_COLLATE=English_United States.1252 
[2] LC_CTYPE=English_United States.1252   
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] bindrcpp_0.2.2      ggridges_0.5.1      broom_0.5.0        
 [4] limma_3.38.2        sva_3.30.0          BiocParallel_1.16.0
 [7] genefilter_1.64.0   mgcv_1.8-25         nlme_3.1-137       
[10] heatmaply_0.15.2    viridis_0.5.1       viridisLite_0.3.0  
[13] plotly_4.8.0        GGally_1.4.0        cowplot_0.9.3      
[16] forcats_0.3.0       stringr_1.3.1       dplyr_0.7.8        
[19] purrr_0.2.5         readr_1.1.1         tidyr_0.8.2        
[22] tibble_1.4.2        ggplot2_3.1.0       tidyverse_1.2.1    

loaded via a namespace (and not attached):
  [1] colorspace_1.3-2     class_7.3-14         modeltools_0.2-22   
  [4] mclust_5.4.2         rprojroot_1.3-2      rstudioapi_0.8      
  [7] flexmix_2.3-14       bit64_0.9-7          fansi_0.4.0         
 [10] AnnotationDbi_1.44.0 mvtnorm_1.0-8        lubridate_1.7.4     
 [13] xml2_1.2.0           splines_3.5.1        codetools_0.2-15    
 [16] robustbase_0.93-3    knitr_1.20           jsonlite_1.5        
 [19] annotate_1.60.0      cluster_2.0.7-1      kernlab_0.9-27      
 [22] shiny_1.2.0          compiler_3.5.1       httr_1.3.1          
 [25] backports_1.1.2      assertthat_0.2.0     Matrix_1.2-15       
 [28] lazyeval_0.2.1       cli_1.0.1            later_0.7.5         
 [31] htmltools_0.3.6      tools_3.5.1          gtable_0.2.0        
 [34] glue_1.3.0           reshape2_1.4.3       Rcpp_1.0.0          
 [37] Biobase_2.42.0       cellranger_1.1.0     trimcluster_0.1-2.1 
 [40] gdata_2.18.0         crosstalk_1.0.0      iterators_1.0.10    
 [43] fpc_2.1-11.1         rvest_0.3.2          mime_0.6            
 [46] gtools_3.8.1         XML_3.98-1.16        dendextend_1.9.0    
 [49] DEoptimR_1.0-8       MASS_7.3-51.1        scales_1.0.0        
 [52] TSP_1.1-6            promises_1.0.1       hms_0.4.2           
 [55] parallel_3.5.1       RColorBrewer_1.1-2   yaml_2.2.0          
 [58] memoise_1.1.0        gridExtra_2.3        reshape_0.8.8       
 [61] stringi_1.2.4        RSQLite_2.1.1        gclus_1.3.1         
 [64] S4Vectors_0.20.1     foreach_1.4.4        seriation_1.2-3     
 [67] caTools_1.17.1.1     BiocGenerics_0.28.0  matrixStats_0.54.0  
 [70] rlang_0.3.0.1        pkgconfig_2.0.2      prabclus_2.2-6      
 [73] bitops_1.0-6         evaluate_0.12        lattice_0.20-38     
 [76] bindr_0.1.1          labeling_0.3         htmlwidgets_1.3     
 [79] bit_1.1-14           tidyselect_0.2.5     plyr_1.8.4          
 [82] magrittr_1.5         R6_2.3.0             IRanges_2.16.0      
 [85] gplots_3.0.1         DBI_1.0.0            pillar_1.3.0        
 [88] haven_1.1.2          whisker_0.3-2        withr_2.1.2         
 [91] survival_2.43-1      RCurl_1.95-4.11      nnet_7.3-12         
 [94] modelr_0.1.2         crayon_1.3.4         utf8_1.1.4          
 [97] KernSmooth_2.23-15   rmarkdown_1.10       grid_3.5.1          
[100] readxl_1.1.0         data.table_1.11.8    blob_1.1.1          
[103] digest_0.6.18        diptest_0.75-7       webshot_0.5.1       
[106] xtable_1.8-3         httpuv_1.4.5         stats4_3.5.1        
[109] munsell_0.5.0        registry_0.5